
Quora is a place to gain and share knowledge—about anything. It’s a platform to ask questions and connect with people who contribute unique insights and quality answers. This empowers people to learn from each other and to better understand the world.
Over 100 million people visit Quora every month, so it's no surprise that many people ask similarly worded questions. Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, and make writers feel they need to answer multiple versions of the same question. Quora values canonical questions because they provide a better experience to active seekers and writers, and offer more value to both of these groups in the long term.
Credits: Kaggle
Problem Statement
- Data will be in a file Train.csv
- Train.csv contains 5 columns : qid1, qid2, question1, question2, is_duplicate
- Size of Train.csv - 60MB
- Number of rows in Train.csv = 404,290
"id","qid1","qid2","question1","question2","is_duplicate" "0","1","2","What is the step by step guide to invest in share market in india?","What is the step by step guide to invest in share market?","0" "1","3","4","What is the story of Kohinoor (Koh-i-Noor) Diamond?","What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?","0" "7","15","16","How can I be a good geologist?","What should I do to be a great geologist?","1" "11","23","24","How do I read and find my YouTube comments?","How can I see all my Youtube comments?","1"
It is a binary classification problem, for a given pair of questions we need to predict if they are duplicate or not.
Source: https://www.kaggle.com/c/quora-question-pairs#evaluation
Metric(s):
We build the train and test sets by randomly splitting the data in a 70:30 (or 80:20) ratio; either works, since we have a sufficient number of data points.
import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from chart_studio.plotly import plotly
import plotly.graph_objs as go
#offline.init_notebook_mode()
from collections import Counter
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import gc
import distance
from bs4 import BeautifulSoup
# Load the raw Quora question-pairs training data and take a first look at it.
df = pd.read_csv("train.csv")
print("Number of data points:",df.shape[0])
df.head()   # notebook-style peek at the first rows
df.info()   # column dtypes and non-null counts
We are given a minimal number of data fields here: id, qid1, qid2, question1, question2, and is_duplicate.
# ---- Class distribution: duplicate vs non-duplicate pair counts ----
df.groupby("is_duplicate")['id'].count().plot.bar()
print('~> Total number of question pairs for training:\n {}'.format(len(df)))
print('~> Question pairs are not Similar (is_duplicate = 0):\n {}%'.format(100 - round(df['is_duplicate'].mean()*100, 2)))
print('\n~> Question pairs are Similar (is_duplicate = 1):\n {}%'.format(round(df['is_duplicate'].mean()*100, 2)))
# ---- Question-frequency statistics ----
# Pool qid1 and qid2 so every occurrence of every question is counted.
qids = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())
unique_qs = len(np.unique(qids))
qs_morethan_onetime = np.sum(qids.value_counts() > 1)
print ('Total number of Unique Questions are: {}\n'.format(unique_qs))
#print len(np.unique(qids))
print ('Number of unique questions that appear more than one time: {} ({}%)\n'.format(qs_morethan_onetime,qs_morethan_onetime/unique_qs*100))
print ('Max number of times a single question is repeated: {}\n'.format(max(qids.value_counts())))
q_vals=qids.value_counts()
q_vals=q_vals.values
x = ["unique_questions" , "Repeated Questions"]
y = [unique_qs , qs_morethan_onetime]
plt.figure(figsize=(10, 6))
plt.title ("Plot representing unique and repeated questions ")
# NOTE(review): positional x/y were removed in seaborn 0.12 -- newer
# versions need sns.barplot(x=x, y=y).
sns.barplot(x,y)
plt.show()
#checking whether there are any repeated pair of questions
pair_duplicates = df[['qid1','qid2','is_duplicate']].groupby(['qid1','qid2']).count().reset_index()
# Rows lost by the groupby = (qid1, qid2) pairs occurring more than once.
print ("Number of duplicate questions",(pair_duplicates).shape[0] - df.shape[0])
# Log-scale histogram of how often individual questions re-occur.
plt.figure(figsize=(20, 10))
plt.hist(qids.value_counts(), bins=160)
# NOTE(review): 'nonposy' was removed in matplotlib 3.5; newer versions
# use nonpositive='clip'.
plt.yscale('log', nonposy='clip')
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurences of question')
plt.ylabel('Number of questions')
print ('Maximum number of times a single question is repeated: {}\n'.format(max(qids.value_counts())))
#Checking whether there are any rows with null values
nan_rows = df[df.isnull().any(1)]
print (nan_rows)
# Filling the null values with ' '
df = df.fillna('')
# Re-check: this should now print an empty frame.
nan_rows = df[df.isnull().any(1)]
print (nan_rows)
Let us now construct a few features like:
# Basic features computed on the raw (un-preprocessed) text.  The result is
# cached to CSV so re-running the notebook skips the expensive apply() passes.
if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    df = pd.read_csv("df_fe_without_preprocessing_train.csv", encoding='latin-1')
else:
    # How often each question id occurs anywhere in the data set.
    df['freq_qid1'] = df.groupby('qid1')['qid1'].transform('count')
    df['freq_qid2'] = df.groupby('qid2')['qid2'].transform('count')
    # Character counts and (space-separated) word counts per question.
    df['q1len'] = df['question1'].str.len()
    df['q2len'] = df['question2'].str.len()
    df['q1_n_words'] = df['question1'].apply(lambda row: len(row.split(" ")))
    df['q2_n_words'] = df['question2'].apply(lambda row: len(row.split(" ")))

    def normalized_word_Common(row):
        # Number of distinct lower-cased words shared by both questions.
        w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
        return 1.0 * len(w1 & w2)

    df['word_Common'] = df.apply(normalized_word_Common, axis=1)

    def normalized_word_Total(row):
        # Combined distinct-word count of the two questions.
        w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
        return 1.0 * (len(w1) + len(w2))

    df['word_Total'] = df.apply(normalized_word_Total, axis=1)

    def normalized_word_share(row):
        # Shared distinct words as a fraction of the combined vocabulary size.
        w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
        return 1.0 * len(w1 & w2)/(len(w1) + len(w2))

    df['word_share'] = df.apply(normalized_word_share, axis=1)

    # Combined and absolute-difference question-id frequencies.
    df['freq_q1+q2'] = df['freq_qid1']+df['freq_qid2']
    df['freq_q1-q2'] = abs(df['freq_qid1']-df['freq_qid2'])
    df.to_csv("df_fe_without_preprocessing_train.csv", index=False)
df.head()
df.shape  # 11 features have been added
print ("Minimum length of the questions in question1 : " , min(df['q1_n_words']))
print ("Minimum length of the questions in question2 : " , min(df['q2_n_words']))
print ("Number of Questions with minimum length [question1] :", df[df['q1_n_words']== 1].shape[0])
print ("Number of Questions with minimum length [question2] :", df[df['q2_n_words']== 1].shape[0])
# word_share split by class: violin plot + overlaid distributions.  Clear
# separation of the red (duplicate) and blue (non-duplicate) curves would
# indicate an informative feature.
plt.figure(figsize=(12, 8))
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_share', data = df[0:])
plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['word_share'][0:] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['word_share'][0:] , label = "0" , color = 'blue' )
plt.legend()
plt.show()
# Same pair of plots for word_Common.
plt.figure(figsize=(12, 8))
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_Common', data = df[0:])
plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['word_Common'][0:] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['word_Common'][0:] , label = "0" , color = 'blue' )
# NOTE(review): unlike the word_share figure there is no plt.legend() call
# here, so the "1"/"0" labels never render -- confirm whether intended.
plt.show()
The distributions of the word_Common feature in similar and non-similar questions are highly overlapping
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc
import re
from nltk.corpus import stopwords
import distance
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# This package is used for finding longest common subsequence between two strings
# you can write your own dp code for this
import distance
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from sklearn.manifold import TSNE
# Import the Required lib packages for WORD-Cloud generation
# https://stackoverflow.com/questions/45625434/how-to-install-wordcloud-in-python3-6
from wordcloud import WordCloud, STOPWORDS
from os import path
from PIL import Image
df.head(2)
# SAFE_DIV pads every ratio denominator so empty token sets cannot divide by
# zero (keeps results finite to ~4 decimal places).
SAFE_DIV = 0.0001
# NLTK English stop-word list (requires the 'stopwords' corpus download).
STOP_WORDS = stopwords.words("english")
# Hoisted out of preprocess() so the 404K-row apply() does not rebuild the
# stemmer and recompile the regex on every call.
_PORTER = PorterStemmer()
_NON_WORD_RE = re.compile(r"\W")  # raw string: bare '\W' is an invalid escape


def preprocess(x):
    """Normalise one question string for feature extraction.

    Steps: lower-case; expand common contractions; spell out currency and
    percent symbols; shorten round numbers (1,000,000 -> 1m, 1,000 -> 1k);
    replace non-word characters with spaces; stem; strip HTML markup.

    Parameters
    ----------
    x : any
        Raw question text (coerced with str(), so NaN becomes "nan").

    Returns
    -------
    str
        The cleaned text.
    """
    x = str(x).lower()
    # Expanding contractions and symbol spellings.
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
        .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    # x is always a str at this point, so the original `type(x) == type('')`
    # guards were dead code and have been removed.
    x = _NON_WORD_RE.sub(' ', x)
    # NOTE(review): stemming the whole sentence in one call only applies the
    # Porter suffix rules at the end of the string (i.e. to the final word);
    # kept as-is so features stay consistent with previously cached CSVs --
    # confirm whether per-word stemming was intended.
    x = _PORTER.stem(x)
    # Explicit parser silences bs4's "no parser specified" warning and makes
    # the output independent of which optional parsers are installed.
    x = BeautifulSoup(x, "html.parser").get_text()
    return x
Definition:
Features:
ctc_min : Ratio of common_token_count to min length of token count of Q1 and Q2
ctc_min = common_token_count / (min(len(q1_tokens), len(q2_tokens))
ctc_max : Ratio of common_token_count to max length of token count of Q1 and Q2
ctc_max = common_token_count / (max(len(q1_tokens), len(q2_tokens))
last_word_eq : Check if Last word of both questions is equal or not
last_word_eq = int(q1_tokens[-1] == q2_tokens[-1])
first_word_eq : Check if First word of both questions is equal or not
first_word_eq = int(q1_tokens[0] == q2_tokens[0])
abs_len_diff : Abs. length difference
abs_len_diff = abs(len(q1_tokens) - len(q2_tokens))
mean_len : Average Token Length of both Questions
mean_len = (len(q1_tokens) + len(q2_tokens))/2
fuzz_ratio : https://github.com/seatgeek/fuzzywuzzy#usage
http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
fuzz_partial_ratio : https://github.com/seatgeek/fuzzywuzzy#usage
http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
def get_token_features(q1, q2):
    """Compute 10 token-overlap features for a question pair.

    Index order: cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
    last_word_eq, first_word_eq, abs_len_diff, mean_len.  Every ratio's
    denominator is padded with SAFE_DIV so it can never be zero.
    """
    token_features = [0.0] * 10
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    # An empty question yields the all-zero feature vector.
    if not q1_tokens or not q2_tokens:
        return token_features
    # Partition each question's tokens into content words and stopwords.
    q1_words = {tok for tok in q1_tokens if tok not in STOP_WORDS}
    q2_words = {tok for tok in q2_tokens if tok not in STOP_WORDS}
    q1_stops = {tok for tok in q1_tokens if tok in STOP_WORDS}
    q2_stops = {tok for tok in q2_tokens if tok in STOP_WORDS}
    # Overlap counts for content words, stopwords, and all tokens.
    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])    # same last word?
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])      # same first word?
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))   # length gap
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2  # mean token length
    return token_features
def get_longest_substr_ratio(a, b):
    """Length of the longest common substring of a and b, divided by the
    shorter string's length (+1 guards against empty strings)."""
    # distance.lcsubstrings returns the set of longest common substrings;
    # they all share the same length, so any element works.
    longest = list(distance.lcsubstrings(a, b))
    if not longest:
        return 0
    return len(longest[0]) / (min(len(a), len(b)) + 1)
def extract_features(df):
    """Add the advanced NLP similarity features to *df* in place.

    Preprocesses both question columns, then appends the 10 token-overlap
    features, four fuzzywuzzy ratios, and the longest-common-substring
    ratio.  Returns the same DataFrame.
    """
    # Clean both question columns first.
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda r: get_token_features(r["question1"], r["question2"]), axis=1)
    # Unpack the 10-element vectors into one column per feature.
    feature_names = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min",
                     "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for pos, col in enumerate(feature_names):
        df[col] = [feats[pos] for feats in token_features]

    # Fuzzy string-matching features; see
    # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    # https://github.com/seatgeek/fuzzywuzzy
    print("fuzzy features..")
    df["token_set_ratio"] = df.apply(lambda r: fuzz.token_set_ratio(r["question1"], r["question2"]), axis=1)
    # token_sort: tokenize, sort alphabetically, rejoin, then plain ratio().
    df["token_sort_ratio"] = df.apply(lambda r: fuzz.token_sort_ratio(r["question1"], r["question2"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda r: fuzz.QRatio(r["question1"], r["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda r: fuzz.partial_ratio(r["question1"], r["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda r: get_longest_substr_ratio(r["question1"], r["question2"]), axis=1)
    return df
# NLP feature cache: load the engineered features if the CSV exists,
# otherwise compute them from the raw training data and cache them.
if os.path.isfile('nlp_features_train.csv'):
    df = pd.read_csv("nlp_features_train.csv", encoding='latin-1')
    # Fixed: `df.fillna('')` returned a new frame that was immediately
    # discarded, so NaNs survived the load.  Assign the result back.
    df = df.fillna('')
else:
    print("Extracting features for train:")
    df = pd.read_csv("train.csv")
    df = extract_features(df)
    df.to_csv("nlp_features_train.csv", index=False)
df.head(2)
df.columns
df.shape
# Split question text by class to build one word cloud per class.
df_duplicate = df[df['is_duplicate'] == 1]
dfp_nonduplicate = df[df['is_duplicate'] == 0]
# Converting 2d array of q1 and q2 and flatten the array: like {{1,2},{3,4}} to {1,2,3,4}
p = np.dstack([df_duplicate["question1"], df_duplicate["question2"]]).flatten()
n = np.dstack([dfp_nonduplicate["question1"], dfp_nonduplicate["question2"]]).flatten()
#https://stackoverflow.com/questions/25116595/understanding-numpys-dstack-function
print ("Number of data points in class 1 (duplicate pairs) :",len(p))
print ("Number of data points in class 0 (non duplicate pairs) :",len(n))
#Saving the np array into a text file
#np.savetxt('train_p.txt', p, delimiter=' ', fmt='%s')
#np.savetxt('train_n.txt', n, delimiter=' ', fmt='%s')
# reading the text files and removing the Stop Words:
d = path.dirname('.')
# NOTE(review): the np.savetxt calls that create these files are commented
# out above, so these reads raise FileNotFoundError unless train_p.txt /
# train_n.txt were written on an earlier run -- confirm they exist.
textp_w = open(path.join(d, 'train_p.txt')).read()
textn_w = open(path.join(d, 'train_n.txt')).read()
# Tweak the word-cloud stop-word list: add fillers, keep polarity words.
stopwords = set(STOPWORDS)
stopwords.add("said")
stopwords.add("br")
stopwords.add(" ")
stopwords.remove("not")
stopwords.remove("no")
#stopwords.remove("good")
#stopwords.remove("love")
stopwords.remove("like")
#stopwords.remove("best")
#stopwords.remove("!")
# len() of a string counts characters, not words, despite the labels below.
print ("Total number of words in duplicate pair questions :",len(textp_w))
print ("Total number of words in non duplicate pair questions :",len(textn_w))
Word Clouds generated from duplicate pair question's text
# Word cloud for duplicate-pair text.  NOTE(review): max_words is given the
# character count len(textp_w), which in practice just means "no cap".
wc = WordCloud(background_color="white", max_words=len(textp_w), stopwords=stopwords)
wc.generate(textp_w)
print ("Word Cloud for Duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
Word Clouds generated from non duplicate pair question's text
# Word cloud for non-duplicate-pair text (same settings as above).
wc = WordCloud(background_color="white", max_words=len(textn_w),stopwords=stopwords)
# generate word cloud
wc.generate(textn_w)
print ("Word Cloud for non-Duplicate Question pairs:")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Pair-wise scatter/KDE of four engineered features, coloured by class.
n = df.shape[0]
# NOTE(review): n equals the full row count, so the [0:n] slice is the whole
# frame; pairplot over ~400K rows is very slow -- subsample if this stalls.
sns.pairplot(df[['ctc_min', 'cwc_min', 'csc_min', 'token_sort_ratio', 'is_duplicate']][0:n], hue='is_duplicate', vars=['ctc_min', 'cwc_min', 'csc_min', 'token_sort_ratio'])
plt.show()
# Distribution of the token_sort_ratio
plt.figure(figsize=(10, 8))
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'token_sort_ratio', data = df[0:] , )
plt.subplot(1,2,2)
# NOTE(review): distplot was removed in seaborn 0.14 (use histplot/displot
# on newer versions) -- kept for the environment this notebook targets.
sns.distplot(df[df['is_duplicate'] == 1.0]['token_sort_ratio'][0:] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['token_sort_ratio'][0:] , label = "0" , color = 'blue' )
plt.show()
# Same pair of plots for fuzz_ratio.
plt.figure(figsize=(10, 8))
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'fuzz_ratio', data = df[0:] , )
plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['fuzz_ratio'][0:] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['fuzz_ratio'][0:] , label = "0" , color = 'blue' )
plt.show()
# 2-D t-SNE embedding of the 15 engineered features (a 3-D run follows below).
from sklearn.preprocessing import MinMaxScaler
# Subsample 5K rows (t-SNE is far too slow on the full data) and scale the
# feature columns to [0, 1] before embedding.
dfp_subsampled = df[0:5000]
X = MinMaxScaler().fit_transform(dfp_subsampled[['cwc_min', 'cwc_max', 'csc_min', 'csc_max' , 'ctc_min' , 'ctc_max' , 'last_word_eq', 'first_word_eq' , 'abs_len_diff' , 'mean_len' , 'token_set_ratio' , 'token_sort_ratio' , 'fuzz_ratio' , 'fuzz_partial_ratio' , 'longest_substr_ratio']])
y = dfp_subsampled['is_duplicate'].values
tsne2d = TSNE(
n_components=2,
init='random', # pca
random_state=101,
method='barnes_hut',
n_iter=1000,
verbose=2,
angle=0.5
).fit_transform(X)
# NOTE(review): this rebinds the module-level `df` (the feature frame) to the
# 2-D embedding; later cells reload their data from CSV so nothing breaks,
# but the shadowing is easy to trip over.
df = pd.DataFrame({'x':tsne2d[:,0], 'y':tsne2d[:,1] ,'label':y})
# draw the plot in appropriate place in the grid
# NOTE(review): lmplot's `size=` parameter became `height=` in seaborn 0.9.
sns.lmplot(data=df, x='x', y='y', hue='label', fit_reg=False, size=8,palette="Set1",markers=['s','o'])
plt.title("perplexity : {} and max_iter : {}".format(30, 1000))
plt.show()
from sklearn.manifold import TSNE
# 3-D t-SNE embedding of the same scaled feature matrix X from the cell above.
tsne3d = TSNE(
n_components=3,
init='random', # pca
random_state=101,
method='barnes_hut',
n_iter=1000,
verbose=2,
angle=0.5
).fit_transform(X)
# Interactive plotly 3-D scatter, coloured by the duplicate label.
trace1 = go.Scatter3d(
x=tsne3d[:,0],
y=tsne3d[:,1],
z=tsne3d[:,2],
mode='markers',
marker=dict(
sizemode='diameter',
color = y,
colorscale = 'Portland',
colorbar = dict(title = 'duplicate'),
line=dict(color='rgb(255, 255, 255)'),
opacity=0.75
)
)
data=[trace1]
layout=dict(height=800, width=800, title='3d embedding with engineered features')
fig=dict(data=data, layout=layout)
py.iplot(fig, filename='3DBubble')
# Combine the basic features (df_basic) with the advanced NLP features
# (df_advanced) into one frame keyed on the question-pair id.
df_basic = pd.read_csv("df_fe_without_preprocessing_train.csv", encoding='latin-1')
df_basic.head(2)
df_basic.columns
df_advanced = pd.read_csv("nlp_features_train.csv", encoding='latin-1')
df_advanced.head(2)
df_advanced.columns
# Drop the columns df_advanced already carries so the merge adds no duplicates.
df1 = df_basic.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df1.head()
df1.shape
data = df_advanced.merge(df1, on='id', how='left')
data.head(2)
data.shape
# Fixed: the combined frame was written to a hard-coded absolute Windows path
# ('D:\PGS\...') but read back from the relative 'my_data.csv', so the read
# failed on any other machine.  Write and read the same relative path.
export_csv = data.to_csv('my_data.csv', index=None, header=True)
# nrows=100000 keeps the modelling section tractable in time and memory.
data = pd.read_csv('my_data.csv', nrows=100000)
data.head(2)
Y = data['is_duplicate']
# Keep the question text columns in X for the TF-IDF vectorisers below;
# drop only the ids and the label.
X = data.drop(['qid1','qid2','is_duplicate'],axis=1)
print(X.shape)
print(Y.shape)
X.head(2)
# Random 70:30 train/test split, stratified on the label so both splits keep
# the original class balance.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y)
print("Number of data points in train data :",X_train.shape)
print("Number of data points in test data :",X_test.shape)
print("-"*10, "Distribution of output variable in train data", "-"*10)
train_distr = Counter(y_train)
train_len = len(y_train)
print("Class 0: ",int(train_distr[0])/train_len,"Class 1: ", int(train_distr[1])/train_len)
# Fixed two copy-paste slips from the block above: the header said
# "train data" for the test split, and Class 0 printed test_distr[1].
print("-"*10, "Distribution of output variable in test data", "-"*10)
test_distr = Counter(y_test)
test_len = len(y_test)
print("Class 0: ",int(test_distr[0])/test_len, "Class 1: ",int(test_distr[1])/test_len)
def scaler(train_column, test_column):
    """Min-max scale one feature column, fitting on the train split only.

    Fixed: the original applied sklearn's Normalizer to the column reshaped
    to (-1, 1).  Normalizer rescales each ROW to unit norm, so with a single
    value per row every non-zero entry collapsed to 1.0, destroying the
    feature.  Column-wise min-max scaling is what per-feature normalisation
    requires, and fitting on train only avoids test-set leakage.

    Parameters
    ----------
    train_column, test_column : pandas.Series
        One numeric feature for the train and test splits.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape (n, 1), train mapped into [0, 1]; test uses
        the train min/range and may fall outside [0, 1].
    """
    train_vals = train_column.values.astype(float).reshape(-1, 1)
    test_vals = test_column.values.astype(float).reshape(-1, 1)
    col_min = train_vals.min()
    col_range = train_vals.max() - col_min
    if col_range == 0:
        # Constant column: map everything to 0 (matches MinMaxScaler).
        return np.zeros_like(train_vals), np.zeros_like(test_vals)
    return (train_vals - col_min) / col_range, (test_vals - col_min) / col_range
# Scale every engineered feature column with scaler(); each call returns the
# (train, test) column vectors, fitted on the train split only.
X_train_cwc_min_norm,X_test_cwc_min_norm = scaler(X_train['cwc_min'],X_test['cwc_min'])
print("After vectorizations")
print(X_train_cwc_min_norm.shape, y_train.shape)
print(X_test_cwc_min_norm.shape, y_test.shape)
print("="*100)
# Remaining advanced-NLP feature columns.
X_train_cwc_max_norm,X_test_cwc_max_norm = scaler(X_train['cwc_max'],X_test['cwc_max'])
X_train_csc_min_norm,X_test_csc_min_norm = scaler(X_train['csc_min'],X_test['csc_min'])
X_train_csc_max_norm,X_test_csc_max_norm = scaler(X_train['csc_max'],X_test['csc_max'])
X_train_ctc_min_norm,X_test_ctc_min_norm = scaler(X_train['ctc_min'],X_test['ctc_min'])
X_train_ctc_max_norm,X_test_ctc_max_norm = scaler(X_train['ctc_max'],X_test['ctc_max'])
X_train_last_word_eq_norm,X_test_last_word_eq_norm = scaler(X_train['last_word_eq'],X_test['last_word_eq'])
X_train_first_word_eq_norm,X_test_first_word_eq_norm = scaler(X_train['first_word_eq'],X_test['first_word_eq'])
X_train_abs_len_diff_norm,X_test_abs_len_diff_norm = scaler(X_train['abs_len_diff'],X_test['abs_len_diff'])
X_train_mean_len_norm,X_test_mean_len_norm = scaler(X_train['mean_len'],X_test['mean_len'])
X_train_token_set_ratio_norm,X_test_token_set_ratio_norm = scaler(X_train['token_set_ratio'],X_test['token_set_ratio'])
X_train_token_sort_ratio_norm,X_test_token_sort_ratio_norm = scaler(X_train['token_sort_ratio'],X_test['token_sort_ratio'])
X_train_fuzz_ratio_norm,X_test_fuzz_ratio_norm = scaler(X_train['fuzz_ratio'],X_test['fuzz_ratio'])
X_train_fuzz_partial_ratio_norm,X_test_fuzz_partial_ratio_norm = scaler(X_train['fuzz_partial_ratio'],X_test['fuzz_partial_ratio'])
X_train_longest_substr_ratio_norm,X_test_longest_substr_ratio_norm = scaler(X_train['longest_substr_ratio'],X_test['longest_substr_ratio'])
# Basic (pre-preprocessing) feature columns.
X_train_freq_qid1_norm,X_test_freq_qid1_norm = scaler(X_train['freq_qid1'],X_test['freq_qid1'])
print("After vectorizations")
print(X_train_freq_qid1_norm.shape, y_train.shape)
print(X_test_freq_qid1_norm.shape, y_test.shape)
print("="*100)
X_train.columns
X_train_freq_qid2_norm,X_test_freq_qid2_norm = scaler(X_train['freq_qid2'],X_test['freq_qid2'])
X_train_q1len_norm,X_test_q1len_norm = scaler(X_train['q1len'],X_test['q1len'])
X_train_q2len_norm,X_test_q2len_norm = scaler(X_train['q2len'],X_test['q2len'])
X_train_q1_n_words_norm,X_test_q1_n_words_norm = scaler(X_train['q1_n_words'],X_test['q1_n_words'])
X_train_q2_n_words_norm,X_test_q2_n_words_norm = scaler(X_train['q2_n_words'],X_test['q2_n_words'])
X_train_word_Common_norm,X_test_word_Common_norm = scaler(X_train['word_Common'],X_test['word_Common'])
X_train_word_Total_norm,X_test_word_Total_norm = scaler(X_train['word_Total'],X_test['word_Total'])
X_train_word_share_norm,X_test_word_share_norm = scaler(X_train['word_share'],X_test['word_share'])
X_train_freq_q1addq2_norm,X_test_freq_q1addq2_norm = scaler(X_train['freq_q1+q2'],X_test['freq_q1+q2'])
X_train_freq_q1subq2_norm,X_test_freq_q1subq2_norm = scaler(X_train['freq_q1-q2'],X_test['freq_q1-q2'])
# TF-IDF features for question1: uni- and bi-grams appearing in at least 10
# training documents, capped at 3000 terms.  Fit on train only (no leakage).
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer1 = TfidfVectorizer(min_df=10,ngram_range=(1,2), max_features=3000)
vectorizer1.fit(X_train['question1'].values.astype('U'))
X_train_q1_tfidf = vectorizer1.transform(X_train['question1'].values.astype('U'))
X_test_q1_tfidf = vectorizer1.transform(X_test['question1'].values.astype('U'))
# NOTE(review): get_feature_names() was renamed get_feature_names_out() in
# scikit-learn 1.0 and removed in 1.2.
f1=vectorizer1.get_feature_names()
print("After vectorization")
print(X_train_q1_tfidf.shape, y_train.shape)
print(X_test_q1_tfidf.shape, y_test.shape)
print("="*100)
# Separate vectorizer for question2, same settings.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer2 = TfidfVectorizer(min_df=10,ngram_range=(1,2), max_features=3000)
vectorizer2.fit(X_train['question2'].values.astype('U'))
X_train_q2_tfidf = vectorizer2.transform(X_train['question2'].values.astype('U'))
X_test_q2_tfidf = vectorizer2.transform(X_test['question2'].values.astype('U'))
print("After vectorization")
print(X_train_q2_tfidf.shape, y_train.shape)
print(X_test_q2_tfidf.shape, y_test.shape)
print("="*100)
#for train data
# Final design matrix: horizontally stack the two 3000-dim TF-IDF blocks and
# all 26 scaled numeric feature columns into one sparse CSR matrix.
from scipy.sparse import coo_matrix,hstack
X_tr_tfidf = hstack((X_train_q1_tfidf,X_train_q2_tfidf,X_train_cwc_min_norm,X_train_cwc_max_norm,X_train_csc_min_norm,X_train_csc_max_norm,
X_train_ctc_min_norm,X_train_ctc_max_norm,X_train_last_word_eq_norm,X_train_first_word_eq_norm,
X_train_abs_len_diff_norm,X_train_mean_len_norm,X_train_token_set_ratio_norm,
X_train_token_sort_ratio_norm,X_train_fuzz_ratio_norm,X_train_fuzz_partial_ratio_norm,
X_train_longest_substr_ratio_norm,X_train_freq_qid1_norm,X_train_freq_qid2_norm,X_train_q1len_norm,X_train_q2len_norm,
X_train_q1_n_words_norm,X_train_q2_n_words_norm,X_train_word_Common_norm,X_train_word_Total_norm,
X_train_word_share_norm,X_train_freq_q1addq2_norm,X_train_freq_q1subq2_norm,)).tocsr()
#for test data -- identical column order so train/test features line up.
X_test_tfidf = hstack((X_test_q1_tfidf,X_test_q2_tfidf,X_test_cwc_min_norm,X_test_cwc_max_norm,X_test_csc_min_norm,X_test_csc_max_norm,
X_test_ctc_min_norm,X_test_ctc_max_norm,X_test_last_word_eq_norm,X_test_first_word_eq_norm,
X_test_abs_len_diff_norm,X_test_mean_len_norm,X_test_token_set_ratio_norm,
X_test_token_sort_ratio_norm,X_test_fuzz_ratio_norm,X_test_fuzz_partial_ratio_norm,
X_test_longest_substr_ratio_norm,X_test_freq_qid1_norm,X_test_freq_qid2_norm,X_test_q1len_norm,X_test_q2len_norm,
X_test_q1_n_words_norm,X_test_q2_n_words_norm,X_test_word_Common_norm,X_test_word_Total_norm,
X_test_word_share_norm,X_test_freq_q1addq2_norm,X_test_freq_q1subq2_norm,)).tocsr()
print("Final Data Matrix")
print(X_tr_tfidf.shape, y_train.shape)
print(X_test_tfidf.shape, y_test.shape)
# Persist the matrices and labels so modelling can restart from here.
# https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
from scipy import sparse
sparse.save_npz("X_tr_tfidf.npz", X_tr_tfidf)
sparse.save_npz("X_test_tfidf.npz", X_test_tfidf)
#https://www.geeksforgeeks.org/numpy-save/
np.save('y_train', y_train.values)
np.save('y_test', y_test.values)
def plot_confusion_matrix(test_y, predict_y):
    """Plot confusion, precision, and recall matrices side by side.

    C[i, j] = number of points of class i predicted as class j.
    Recall matrix A    = C normalised over rows (each row sums to 1).
    Precision matrix B = C normalised over columns (each column sums to 1).
    """
    C = confusion_matrix(test_y, predict_y)
    # Row-normalise for recall, column-normalise for precision, e.g.
    # C = [[1, 2], [3, 4]] -> A = [[1/3, 2/3], [3/7, 4/7]]
    #                         B = [[1/4, 2/6], [3/4, 4/6]]
    A = ((C.T) / (C.sum(axis=1))).T
    B = C / C.sum(axis=0)

    plt.figure(figsize=(20, 4))
    labels = [1, 2]
    cmap = sns.light_palette("blue")
    # One heatmap per matrix, in the order: raw counts, precision, recall.
    panels = [(C, "Confusion matrix"), (B, "Precision matrix"), (A, "Recall matrix")]
    for position, (matrix, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        sns.heatmap(matrix, annot=True, cmap=cmap, fmt=".3f",
                    xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted Class')
        plt.ylabel('Original Class')
        plt.title(title)
    plt.show()
# Random-model baseline: for every test point emit a random probability
# distribution over the 2 classes; any trained model must beat this log-loss.
# ref: https://stackoverflow.com/a/18662466/4084039
# Fixed: `sklearn.metrics.classification` was a private module removed in
# scikit-learn 0.24; the public import path is sklearn.metrics.
from sklearn.metrics import accuracy_score, log_loss

test_len = len(y_test)
predicted_y = np.zeros((test_len, 2))
for i in range(test_len):
    # Two uniform randoms normalised to sum to 1 -> a valid distribution.
    # https://www.geeksforgeeks.org/numpy-random-rand-python/
    rand_probs = np.random.rand(1, 2)
    predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
print("Log loss on Test Data using Random Model", log_loss(y_test, predicted_y, eps=1e-15))
predicted_y = np.argmax(predicted_y, axis=1)
plot_confusion_matrix(y_test, predicted_y)
# Logistic regression (loss='log') trained with SGD; sweep the L2
# regularisation strength alpha and pick the value with lowest log-loss.
# SGDClassifier reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

alpha = [10 ** x for x in range(-5, 4)]  # 1e-5 ... 1e3
# NOTE(review): alpha is selected on the test split itself; a separate
# validation split (or CV) would keep the test estimate unbiased.
log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', class_weight='balanced', random_state=42)
    clf.fit(X_tr_tfidf, y_train)
    # Sigmoid (Platt) calibration turns raw SGD outputs into probabilities.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_tr_tfidf, y_train)
    predict_y = sig_clf.predict_proba(X_test_tfidf)
    loss = log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For values of alpha = ', i, "The log loss is:", loss)

# Plot log-loss against each alpha tried.
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for i, txt in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit at the best alpha and report train/test log-loss plus the
# confusion/precision/recall matrices.
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', class_weight='balanced', random_state=42)
clf.fit(X_tr_tfidf, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_tr_tfidf, y_train)
predict_y = sig_clf.predict_proba(X_tr_tfidf)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_tfidf)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print("Total number of data points :", len(predicted_y))
plot_confusion_matrix(y_test, predicted_y)
# Linear SVM (SGD with hinge loss) on the TF-IDF features.
# Hinge loss yields no probabilities, so CalibratedClassifierCV with a
# sigmoid (Platt scaling) is used to obtain predict_proba for log loss.
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: 1e-5 ... 1e3 on a log scale.
alpha = [10 ** x for x in range(-5, 4)]
# SGDClassifier docs:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
log_error_array = []
for alpha_val in alpha:
    base_clf = SGDClassifier(alpha=alpha_val, penalty='l2', loss='hinge',
                             class_weight='balanced', random_state=42)
    base_clf.fit(X_tr_tfidf, y_train)
    calibrated = CalibratedClassifierCV(base_clf, method="sigmoid")
    calibrated.fit(X_tr_tfidf, y_train)
    test_probs = calibrated.predict_proba(X_test_tfidf)
    loss = log_loss(y_test, test_probs, labels=base_clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For values of alpha = ', alpha_val, "The log loss is:", loss)

# Plot log loss against alpha, annotating each point with (alpha, loss).
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for idx, rounded_loss in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[idx], np.round(rounded_loss, 3)),
                (alpha[idx], log_error_array[idx]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit at the loss-minimising alpha and report train/test log loss plus
# the test confusion matrix.
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='hinge',
                    class_weight='balanced', random_state=42)
clf.fit(X_tr_tfidf, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_tr_tfidf, y_train)
predict_y = sig_clf.predict_proba(X_tr_tfidf)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_tfidf)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print("Total number of data points :", len(predicted_y))
plot_confusion_matrix(y_test, predicted_y)
# Reload the saved TF-IDF feature matrices and label arrays so this
# modelling section can run independently of the feature-building cells.
from scipy import sparse
import numpy as np
X_tr_tfidf= sparse.load_npz("X_tr_tfidf.npz")
X_test_tfidf = sparse.load_npz("X_test_tfidf.npz")
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')
#https://dask-ml.readthedocs.io/en/stable/modules/generated/dask_ml.xgboost.XGBClassifier.html
#https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
# Randomized search over tree count and depth, 5-fold CV on the train
# matrix, scored by negative log loss; n_jobs=-1 uses all cores.
xgb = XGBClassifier()
parameters = {'n_estimators': [4, 8, 16, 32, 64],'max_depth': [4, 6, 8, 10, 12]}
clf1 = RandomizedSearchCV(xgb, parameters, cv=5, scoring='neg_log_loss',return_train_score=True,n_jobs=-1)
rs1 = clf1.fit(X_tr_tfidf, y_train)
# Persist the CV results so the plotting cell below can reload them.
df=pd.DataFrame(clf1.cv_results_)
df.head(2)  # displays only in a notebook; no effect as a script
df.to_csv(r'HYP.csv')
# Reload the randomized-search results and draw a 3-D scatter of train vs.
# cross-validation mean scores over (n_estimators, max_depth) with plotly.
df = pd.read_csv("HYP.csv")
%matplotlib inline
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
def enable_plotly_in_cell():
    # Make plotly render inside a Google Colab output cell by injecting
    # require.js, then re-initialising notebook mode.
    import IPython
    from plotly.offline import init_notebook_mode
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)
# https://plot.ly/python/3d-axes/
# NOTE(review): scoring was 'neg_log_loss', so the plotted mean scores are
# the NEGATED log loss (higher is better), despite the z-axis title.
trace1 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_train_score'], name = 'train')
trace2 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_test_score'], name = 'Cross validation')
data = [trace1, trace2]
enable_plotly_in_cell()
layout = go.Layout(scene = dict(
    xaxis = dict(title='Estimators'),
    yaxis = dict(title='Max_depth'),
    zaxis = dict(title='Log loss'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
# Final XGBoost model on the TF-IDF features with the hyperparameters
# chosen from the randomized search above; report train/test log loss
# and the test confusion matrix.
best_parameters = {'n_estimators': [64], 'max_depth': [6]}
tuned_xgb = XGBClassifier(n_estimators=64, max_depth=6)
tuned_xgb.fit(X_tr_tfidf, y_train)
train_probs = tuned_xgb.predict_proba(X_tr_tfidf)
test_probs = tuned_xgb.predict_proba(X_test_tfidf)
train_preds = tuned_xgb.predict(X_tr_tfidf)
test_preds = tuned_xgb.predict(X_test_tfidf)
print("The train log loss for the best hyperparameters is:", log_loss(y_train, train_probs, eps=1e-15))
print("The test log loss for the best hyperparameters is:", log_loss(y_test, test_probs, eps=1e-15))
plot_confusion_matrix(y_test, test_preds)
# pretrained glove model
# Load a pickled dict mapping word -> GloVe vector; glove_words is the
# vocabulary set used for fast membership tests below.
# NOTE(review): hard-coded absolute Windows path — adjust per machine.
with open('C:\\Users\\Admin\\Assignments and case studies\\Mandatory\\Assignment 7-SVM on donors choose\\glove_vectors', 'rb') as f:
    model = pickle.load(f)
glove_words = set(model.keys())
print ("Done.",len(model)," words loaded!")
# Force both question columns to strings on train and test so downstream
# .split() calls never see NaN/float entries.
X_train.columns
for frame in (X_train, X_test):
    frame['question1'] = frame['question1'].apply(str)
    frame['question2'] = frame['question2'].apply(str)
#fitting the tfidf model only on train data to prevent data leakage
tfidf_model = TfidfVectorizer()
tfidf_model.fit(X_train['question1'].values.astype('U'))
#we are converting a dictionary with word as a key, and the idf as a value
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (get_feature_names_out() replaces it); kept as-is for the older sklearn
# this notebook evidently runs on (see sklearn.metrics.classification use).
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words_q1 = set(tfidf_model.get_feature_names())
# For train data: TF-IDF weighted word2vec for question1.
# Each question vector = sum(idf(w)*tf(w)*glove(w)) / sum(idf(w)*tf(w))
# over words that have both a GloVe vector and a train-fitted idf value.
from collections import Counter

tfidf_w2v_train_q1 = []  # one weighted-average vector per question
for sentence in tqdm(X_train['question1']):
    words = sentence.split()
    # Bug fix: the original tf used sentence.count(word), which counts
    # SUBSTRING occurrences (e.g. 'is' matches inside 'this'); count whole
    # tokens instead. Counter also hoists the counting out of the inner
    # loop, removing the accidental O(n^2) per sentence.
    word_counts = Counter(words)
    n_words = len(words)
    vector = np.zeros(300)  # GloVe vectors here are 300-dimensional
    tf_idf_weight = 0  # running sum of the tf-idf weights actually used
    for word in words:
        if (word in glove_words) and (word in tfidf_words_q1):
            tf_idf = dictionary[word] * (word_counts[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_train_q1.append(vector)
print(len(tfidf_w2v_train_q1))
print(len(tfidf_w2v_train_q1[0]))
# For test data: TF-IDF weighted word2vec for question1 (idf values come
# from the train-fitted vectorizer, so no leakage).
from collections import Counter

tfidf_w2v_test_q1 = []  # one weighted-average vector per question
for sentence in tqdm(X_test['question1']):
    words = sentence.split()
    # Bug fix: token counts via Counter instead of sentence.count(word),
    # which counted substring (not word) occurrences and was O(n^2).
    word_counts = Counter(words)
    n_words = len(words)
    vector = np.zeros(300)  # GloVe vectors here are 300-dimensional
    tf_idf_weight = 0  # running sum of the tf-idf weights actually used
    for word in words:
        if (word in glove_words) and (word in tfidf_words_q1):
            tf_idf = dictionary[word] * (word_counts[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_test_q1.append(vector)
print(len(tfidf_w2v_test_q1))
print(len(tfidf_w2v_test_q1[0]))
#fitting the tfidf model only on train data to prevent data leakage
tfidf_model = TfidfVectorizer()
tfidf_model.fit(X_train['question2'].values.astype('U'))
#we are converting a dictionary with word as a key, and the idf as a value
# NOTE(review): this rebinds `dictionary`, so q1's idf map is gone after
# this cell — run the q1 vectorisation cells before this one.
# get_feature_names() was removed in scikit-learn 1.2; kept for the old
# sklearn this notebook runs on.
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words_q2 = set(tfidf_model.get_feature_names())
# For train data: TF-IDF weighted word2vec for question2.
from collections import Counter

tfidf_w2v_train_q2 = []  # one weighted-average vector per question
for sentence in tqdm(X_train['question2']):
    words = sentence.split()
    # Bug fix: token counts via Counter instead of sentence.count(word),
    # which counted substring (not word) occurrences and was O(n^2).
    word_counts = Counter(words)
    n_words = len(words)
    vector = np.zeros(300)  # GloVe vectors here are 300-dimensional
    tf_idf_weight = 0  # running sum of the tf-idf weights actually used
    for word in words:
        if (word in glove_words) and (word in tfidf_words_q2):
            tf_idf = dictionary[word] * (word_counts[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_train_q2.append(vector)
print(len(tfidf_w2v_train_q2))
print(len(tfidf_w2v_train_q2[0]))
# For test data: TF-IDF weighted word2vec for question2.
from collections import Counter

tfidf_w2v_test_q2 = []  # one weighted-average vector per question
for sentence in tqdm(X_test['question2']):
    words = sentence.split()
    # Bug fix: token counts via Counter instead of sentence.count(word),
    # which counted substring (not word) occurrences and was O(n^2).
    word_counts = Counter(words)
    n_words = len(words)
    vector = np.zeros(300)  # GloVe vectors here are 300-dimensional
    tf_idf_weight = 0  # running sum of the tf-idf weights actually used
    for word in words:
        if (word in glove_words) and (word in tfidf_words_q2):
            tf_idf = dictionary[word] * (word_counts[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_test_q2.append(vector)
print(len(tfidf_w2v_test_q2))
print(len(tfidf_w2v_test_q2[0]))
#for train data
# Stack the two question embeddings with every normalised hand-crafted
# feature (fuzzy ratios, token stats, frequency features, ...) into one
# sparse train matrix. Column order must match the test matrix below.
from scipy.sparse import coo_matrix,hstack
X_tr_tfidf_w2v = hstack((tfidf_w2v_train_q1,tfidf_w2v_train_q2,coo_matrix(X_train_cwc_min_norm),coo_matrix(X_train_cwc_max_norm),coo_matrix(X_train_csc_min_norm),coo_matrix(X_train_csc_max_norm),
                coo_matrix(X_train_ctc_min_norm),coo_matrix(X_train_ctc_max_norm),coo_matrix(X_train_last_word_eq_norm),coo_matrix(X_train_first_word_eq_norm),
                coo_matrix(X_train_abs_len_diff_norm),coo_matrix(X_train_mean_len_norm),coo_matrix(X_train_token_set_ratio_norm),
                coo_matrix(X_train_token_sort_ratio_norm),coo_matrix(X_train_fuzz_ratio_norm),coo_matrix(X_train_fuzz_partial_ratio_norm),
                coo_matrix(X_train_longest_substr_ratio_norm),coo_matrix(X_train_freq_qid1_norm),coo_matrix(X_train_freq_qid2_norm),coo_matrix(X_train_q1len_norm),coo_matrix(X_train_q2len_norm),
                coo_matrix(X_train_q1_n_words_norm),coo_matrix(X_train_q2_n_words_norm),coo_matrix(X_train_word_Common_norm),coo_matrix(X_train_word_Total_norm),
                coo_matrix(X_train_word_share_norm),coo_matrix(X_train_freq_q1addq2_norm),coo_matrix(X_train_freq_q1subq2_norm))).tocsr()
#for test data
# Same features in the same column order for the test matrix.
X_test_tfidf_w2v = hstack((tfidf_w2v_test_q1,tfidf_w2v_test_q2,coo_matrix(X_test_cwc_min_norm),coo_matrix(X_test_cwc_max_norm),coo_matrix(X_test_csc_min_norm),coo_matrix(X_test_csc_max_norm),
                coo_matrix(X_test_ctc_min_norm),coo_matrix(X_test_ctc_max_norm),coo_matrix(X_test_last_word_eq_norm),coo_matrix(X_test_first_word_eq_norm),
                coo_matrix(X_test_abs_len_diff_norm),coo_matrix(X_test_mean_len_norm),coo_matrix(X_test_token_set_ratio_norm),
                coo_matrix(X_test_token_sort_ratio_norm),coo_matrix(X_test_fuzz_ratio_norm),coo_matrix(X_test_fuzz_partial_ratio_norm),
                coo_matrix(X_test_longest_substr_ratio_norm),coo_matrix(X_test_freq_qid1_norm),coo_matrix(X_test_freq_qid2_norm),coo_matrix(X_test_q1len_norm),coo_matrix(X_test_q2len_norm),
                coo_matrix(X_test_q1_n_words_norm),coo_matrix(X_test_q2_n_words_norm),coo_matrix(X_test_word_Common_norm),coo_matrix(X_test_word_Total_norm),
                coo_matrix(X_test_word_share_norm),coo_matrix(X_test_freq_q1addq2_norm),coo_matrix(X_test_freq_q1subq2_norm))).tocsr()
print("Final Data Matrix")
print(X_tr_tfidf_w2v.shape, y_train.shape)
print(X_test_tfidf_w2v.shape, y_test.shape)
# https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
# Persist the final matrices so modelling cells can reload them from disk.
from scipy import sparse
sparse.save_npz("X_tr_tfidf_w2v.npz", X_tr_tfidf_w2v)
sparse.save_npz("X_test_tfidf_w2v.npz", X_test_tfidf_w2v)
# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    """Plot the confusion, precision and recall matrices side by side.

    test_y    : true binary labels (0 = not duplicate, 1 = duplicate)
    predict_y : predicted binary labels

    C[i, j] counts points of true class i predicted as class j.
    The recall matrix normalises each ROW of C to sum to 1; the
    precision matrix normalises each COLUMN of C to sum to 1.
    """
    C = confusion_matrix(test_y, predict_y)
    # Recall matrix: divide each row by its sum, e.g.
    # C = [[1, 2], [3, 4]] -> A = [[1/3, 2/3], [3/7, 4/7]]
    A = (((C.T)/(C.sum(axis=1))).T)
    # Precision matrix: divide each column by its sum, e.g.
    # C = [[1, 2], [3, 4]] -> B = [[1/4, 2/6], [3/4, 4/6]]
    B = (C/C.sum(axis=0))

    plt.figure(figsize=(20,4))
    # Bug fix: the tick labels previously read [1, 2]; the classes that
    # confusion_matrix produces for this binary problem are 0 and 1.
    labels = [0, 1]
    cmap = sns.light_palette("blue")

    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")

    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")

    plt.subplot(1, 3, 3)
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    plt.show()
# Random baseline: for every test point predict a random probability pair
# (two random numbers normalised to sum to 1). Its log loss is the floor
# that any real model must beat.
# ref: https://stackoverflow.com/a/18662466/4084039
# Bug fix: sklearn.metrics.classification is a private module path removed
# in scikit-learn 0.24; the public sklearn.metrics path works everywhere.
from sklearn.metrics import accuracy_score, log_loss
test_len = len(y_test)
predicted_y = np.zeros((test_len,2))
for i in range(test_len):
    # https://www.geeksforgeeks.org/numpy-random-rand-python/
    rand_probs = np.random.rand(1,2)
    predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
print("Log loss on Test Data using Random Model",log_loss(y_test, predicted_y, eps=1e-15))
# hard labels for the confusion matrix: class with the larger probability
predicted_y =np.argmax(predicted_y, axis=1)
plot_confusion_matrix(y_test, predicted_y)
# Logistic regression via SGD (log loss) on the TF-IDF-weighted-w2v
# features. Scores are calibrated with a sigmoid (Platt scaling) so that
# predict_proba is available for log loss.
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: 1e-5 ... 1e3 on a log scale.
alpha = [10 ** x for x in range(-5, 4)]
# SGDClassifier docs:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
log_error_array = []
for alpha_val in alpha:
    base_clf = SGDClassifier(alpha=alpha_val, penalty='l2', loss='log',
                             class_weight='balanced', random_state=42)
    base_clf.fit(X_tr_tfidf_w2v, y_train)
    calibrated = CalibratedClassifierCV(base_clf, method="sigmoid")
    calibrated.fit(X_tr_tfidf_w2v, y_train)
    test_probs = calibrated.predict_proba(X_test_tfidf_w2v)
    loss = log_loss(y_test, test_probs, labels=base_clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For values of alpha = ', alpha_val, "The log loss is:", loss)

# Plot log loss against alpha, annotating each point with (alpha, loss).
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for idx, rounded_loss in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[idx], np.round(rounded_loss, 3)),
                (alpha[idx], log_error_array[idx]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit at the loss-minimising alpha and report train/test log loss plus
# the test confusion matrix.
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log',
                    class_weight='balanced', random_state=42)
clf.fit(X_tr_tfidf_w2v, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_tr_tfidf_w2v, y_train)
predict_y = sig_clf.predict_proba(X_tr_tfidf_w2v)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_tfidf_w2v)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print("Total number of data points :", len(predicted_y))
plot_confusion_matrix(y_test, predicted_y)
# Linear SVM (SGD with hinge loss) on the TF-IDF-weighted-w2v features.
# Hinge loss yields no probabilities, so CalibratedClassifierCV with a
# sigmoid (Platt scaling) supplies predict_proba for log loss.
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: 1e-5 ... 1e3 on a log scale.
alpha = [10 ** x for x in range(-5, 4)]
# SGDClassifier docs:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
log_error_array = []
for alpha_val in alpha:
    base_clf = SGDClassifier(alpha=alpha_val, penalty='l2', loss='hinge',
                             class_weight='balanced', random_state=42)
    base_clf.fit(X_tr_tfidf_w2v, y_train)
    calibrated = CalibratedClassifierCV(base_clf, method="sigmoid")
    calibrated.fit(X_tr_tfidf_w2v, y_train)
    test_probs = calibrated.predict_proba(X_test_tfidf_w2v)
    loss = log_loss(y_test, test_probs, labels=base_clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For values of alpha = ', alpha_val, "The log loss is:", loss)

# Plot log loss against alpha, annotating each point with (alpha, loss).
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for idx, rounded_loss in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[idx], np.round(rounded_loss, 3)),
                (alpha[idx], log_error_array[idx]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit at the loss-minimising alpha and report train/test log loss plus
# the test confusion matrix.
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='hinge',
                    class_weight='balanced', random_state=42)
clf.fit(X_tr_tfidf_w2v, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_tr_tfidf_w2v, y_train)
predict_y = sig_clf.predict_proba(X_tr_tfidf_w2v)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_tfidf_w2v)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print("Total number of data points :", len(predicted_y))
plot_confusion_matrix(y_test, predicted_y)
# Reload the saved tfidf-w2v feature matrices and label arrays so this
# modelling section can run independently of the feature-building cells.
from scipy import sparse
import numpy as np
X_tr_tfidf_w2v= sparse.load_npz("X_tr_tfidf_w2v.npz")
X_test_tfidf_w2v = sparse.load_npz("X_test_tfidf_w2v.npz")
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')
#https://dask-ml.readthedocs.io/en/stable/modules/generated/dask_ml.xgboost.XGBClassifier.html
#https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
# Randomized search over tree count and depth, 5-fold CV on the train
# matrix, scored by negative log loss; n_jobs=-1 uses all cores.
xgb = XGBClassifier()
parameters = {'n_estimators': [4, 8, 16, 32, 64],'max_depth': [4, 6, 8, 10, 12]}
clf1 = RandomizedSearchCV(xgb, parameters, cv=5, scoring='neg_log_loss',return_train_score=True,n_jobs=-1)
rs1 = clf1.fit(X_tr_tfidf_w2v, y_train)
# Persist the CV results so the plotting cell below can reload them.
df=pd.DataFrame(clf1.cv_results_)
df.head(2)  # displays only in a notebook; no effect as a script
df.to_csv(r'HYP_w2v.csv')
# Reload the randomized-search results and draw a 3-D scatter of train vs.
# cross-validation mean scores over (n_estimators, max_depth) with plotly.
df = pd.read_csv("HYP_w2v.csv")
%matplotlib inline
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
def enable_plotly_in_cell():
    # Make plotly render inside a Google Colab output cell by injecting
    # require.js, then re-initialising notebook mode.
    import IPython
    from plotly.offline import init_notebook_mode
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)
# https://plot.ly/python/3d-axes/
# NOTE(review): scoring was 'neg_log_loss', so the plotted mean scores are
# the NEGATED log loss (higher is better), despite the z-axis title.
trace1 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_train_score'], name = 'train')
trace2 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_test_score'], name = 'Cross validation')
data = [trace1, trace2]
enable_plotly_in_cell()
layout = go.Layout(scene = dict(
    xaxis = dict(title='Estimators'),
    yaxis = dict(title='Max_depth'),
    zaxis = dict(title='Log-loss'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
print(clf1.best_estimator_)
# best hyperparameters picked from the search results above
best_parameters = {'n_estimators': [32],'max_depth': [4]}
# Final XGBoost model on the tfidf-w2v features using the native
# xgboost train API with early stopping monitored on the test set.
import xgboost as xgb
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4
# Fix: dropped params['n_estimators'] — that is a sklearn-wrapper argument
# which xgb.train does not recognise (the number of boosting rounds is the
# num_boost_round positional argument below).
d_train = xgb.DMatrix(X_tr_tfidf_w2v, label=y_train)
d_test = xgb.DMatrix(X_test_tfidf_w2v, label=y_test)
watchlist = [(d_train, 'train'), (d_test, 'valid')]
# up to 400 rounds; stop if 'valid' logloss fails to improve for 20 rounds
bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=10)
# Fix: removed the unused xgdmat DMatrix — it duplicated d_train and only
# wasted memory.
predict_y = bst.predict(d_test)
print("The test log loss is:",log_loss(y_test, predict_y,eps=1e-15))
# threshold the positive-class probability at 0.5 to get hard labels
predicted_y =np.array(predict_y>0.5,dtype=int)
print("Total number of data points :", len(predicted_y))
plot_confusion_matrix(y_test, predicted_y)
# Summary table of the test log loss for every vectorizer/model pairing.
# Ref: http://zetcode.com/python/prettytable/
from prettytable import PrettyTable

summary = PrettyTable()
summary.field_names = ["Vectorizer","Models","Test Log-loss"]
results = [
    ["TFIDF", "Random model", 0.884],
    ["TFIDF", "Logistic regression", 0.470],
    ["TFIDF", "Linear SVM", 0.471],
    ["TFIDF", "XGBoost", 0.504],
    ["--------", "------------------", "------------------"],
    ["TFIDF W2V", "Random model", 0.884],
    ["TFIDF W2V", "Logistic regression", 0.557],
    ["TFIDF W2V", "Linear SVM", 0.560],
    ["TFIDF W2V", "XGBoost", 0.661],
]
for row in results:
    summary.add_row(row)
print(summary)